suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(parallel)
library(pander)
library(plotly)
library(tidyverse)
library(tximport)
library(vsn)
library(zinbwave)
library(Rtsne)
})
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
source(here("UPSCb-common/src/R/featureSelection.R"))
hpal <- colorRampPalette(c("blue","white","red"))(100)
samples <- read_csv(here("doc/samples_final.csv"))
## Rows: 48 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ScilifeID, SubmittedID, Stages, Description, ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filelist <- list.files(here("data/Salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Sanity check to ensure that the data is sorted according to the sample info
filelist <- filelist[match(samples$ScilifeID,sub("_sortmerna.*","",basename(dirname(filelist))))]
stopifnot(all(match(sub("_sortmerna.*","",basename(dirname(filelist))),
samples$ScilifeID) == 1:nrow(samples)))
name the file list vector
names(filelist) <- samples$ID
Read the expression at the gene level
txi <- suppressMessages(tximport(files = filelist,
type = "salmon",
txOut=TRUE))
combine technical replicates
samples$ID <- sub("_L00[1,2]", "",
samples$ScilifeID)
counts <- round(do.call(
cbind,
lapply(split.data.frame(t(txi$counts),
samples$ID),
colSums)))
#tpm <- do.call(
# cbind,
# lapply(split.data.frame(t(txi$abundance),
# samples$ID),
# colMeans))
csamples <- samples[,-1]
csamples <- csamples[match(colnames(counts),csamples$ID),]
read the expression for the pool of lincRNAs we found
linc_read <- read_tsv("~/Git/lncRNAs/doc/lincRNAs.tsv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 62202 Columns: 146
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (29): TRINITY_ID, peak, gmap_loc, gmap_type, Exon, scaffold, t_start, t...
## dbl (111): raw_P464_201B, raw_P464_202, raw_P464_203, raw_P464_204, raw_P464...
## lgl (6): taxonomy, coding, non_coding, seidr, lincRNAs, closest_length > 1000
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
linc <- linc_read$TRINITY_ID
counts <- counts[linc, ]
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "10.8% percent (6687) of 62202 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(csamples)
ggplot(dat,aes(x,y,fill=csamples$Stages)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected, considering we have lincRNAs, caracterised by a really low signal.
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 6687 rows containing non-finite values (`stat_density()`).
The same is done for the individual samples colored by Stages.
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Stages=csamples$Stages[match(ind,csamples$ID)])
ggplot(dat,aes(x=values,group=ind,col=Stages)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 1216340 rows containing non-finite values (`stat_density()`).
dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data_linc_new.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.
csamples$Stages <- factor(csamples$Stages)
there are a lot of zeros, so we use zinbwave
se <- SummarizedExperiment(assays=list(counts=as.matrix(counts)),
colData=as.data.frame(csamples))
Remove non expressed
se <- se[rowSums(assay(se))>0,]
zinb <- zinbwave(se,K=0,epsilon=1e12,
X="~Stages",
observationalWeights=TRUE)
save(zinb,file=here("data/analysis/salmon/zinb_linc.rda"))
Check the size factors (i.e. the sequencing library size effect)
dds <- DESeqDataSet(zinb,design=~Stages)
## converting counts to integer mode
dds <- DESeq(dds,
sfType = "poscounts",
useT = TRUE,
minmu = 1e-6)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## Warning in getAndCheckWeights(object, modelMatrix, weightThreshold = weightThreshold): for 1197 row(s), the weights as supplied won't allow parameter estimation, producing a
## degenerate design matrix. These rows have been flagged in mcols(dds)$weightsFail
## and treated as if the row contained all zeros (mcols(dds)$allZero set to TRUE).
## If you are blocking for donors/organisms, consider design = ~0+donor+condition.
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
save(dds,file=here("data/analysis/salmon/dds_linc.rda"))
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
save(vst,file=here("data/analysis/DE/vst-blind_linc.rda"))
vsda <- varianceStabilizingTransformation(dds, blind=FALSE)
vsta <- assay(vsda)
vsta <- vsta - min(vsta)
save(vsta,file=here("data/analysis/DE/vst-aware_linc.rda"))
# prepare the data to build the network
#ID <- rownames(vsta)
#vsta <- cbind(ID,vsta)
#vsta_tibble <- as_tibble(vsta)
#write_tsv(vsta_tibble,path=here("data/analysis/DE/vst-aware_linc_new.tsv"))
#thing <- read_tsv(here("data/analysis/DE/vst-aware_linc.tsv"))
#library(Biostrings)
#seq <- readDNAStringSet("data/trinity/Trinity.fasta")
#names(seq) <- sub(" .*","",names(seq))
#IDs <- thing$ID
#writeXStringSet(seq[IDs],file="~/Git/lncRNAs/data/analysis/DE/linc_all.fasta")
meanSdPlot(log2(counts(dds)[!mcols(dds)$allZero,]+1))
meanSdPlot(log2(assay(zinb)+1))
meanSdPlot(vst[rowSums(vst)>0,])
meanSdPlot(vsta[rowSums(vsta)>0,])
#load("data/analysis/DE/vst-aware_linc.rda")
#load("data/analysis/salmon/dds_linc.rda")
pc <- prcomp(t(vsta))
percent <- round(summary(pc)$importance[2,]*100)
We define the number of variable of the model
nvar=1
And the number of possible combinations
nlevel=nlevels(dds$Stages)
We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
csamples)
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Stages,text=dds$ID)) +
geom_point(size=2) +
theme_classic() +
ggtitle("Principal Component Analysis lincRNAs") +
#,subtitle="variance stabilized counts"
labs(x=paste("PC1 (",percent[1],"%)",sep=""),
y=paste("PC2 (",percent[2],"%)",sep="")) +
theme(text=element_text(size=12)) +
#theme(plot.title=element_text(size=10)) +
theme(plot.title = element_text(face = "bold"))
p
plot(p + labs(x=paste("PC1 (",percent[1],"%)",sep=""),
y=paste("PC2 (",percent[2],"%)",sep="")))
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise
conds <- factor(csamples$Stages)
sels <- rangeFeatureSelect(counts=vsta,
conditions=conds,
nrep=3)
vst.cutoff <- 2
sel_linc_new <- vsta[sels[[vst.cutoff + 1]],]
mar <- par("mar")
par(mar=c(0.05,0.05,0.05,0.05))
hm <- heatmap.2(t(scale(t(vsta[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=conds)
# The Biological QA is good, considering it's based on lincRNAs. We have no outliers.
# The sequencing depth decreased comparing to the previous analysis.
# Looking at the PCA, it could be interesting to do DE analysis to see in particular what's going on
# in S3 and S6. I consider those two stages relevant, because I think things are changes here.
#
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel grid stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] Rtsne_0.16 zinbwave_1.22.0
## [3] SingleCellExperiment_1.22.0 vsn_3.68.0
## [5] tximport_1.28.0 lubridate_1.9.2
## [7] forcats_1.0.0 stringr_1.5.0
## [9] dplyr_1.1.2 purrr_1.0.1
## [11] readr_2.1.4 tidyr_1.3.0
## [13] tibble_3.2.1 tidyverse_2.0.0
## [15] plotly_4.10.2 pander_0.6.5
## [17] hyperSpec_0.100.0 xml2_1.3.5
## [19] ggplot2_3.4.2 lattice_0.21-8
## [21] here_1.0.1 gplots_3.1.3
## [23] DESeq2_1.40.2 SummarizedExperiment_1.30.2
## [25] Biobase_2.60.0 MatrixGenerics_1.12.2
## [27] matrixStats_1.0.0 GenomicRanges_1.52.0
## [29] GenomeInfoDb_1.36.1 IRanges_2.34.1
## [31] S4Vectors_0.38.1 BiocGenerics_0.46.0
## [33] data.table_1.14.8
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.15.0 jsonlite_1.8.7
## [4] magrittr_2.0.3 farver_2.1.1 rmarkdown_2.23
## [7] zlibbioc_1.46.0 vctrs_0.6.3 memoise_2.0.1
## [10] RCurl_1.98-1.12 htmltools_0.5.5 S4Arrays_1.0.5
## [13] sass_0.4.7 KernSmooth_2.23-22 bslib_0.5.0
## [16] htmlwidgets_1.6.2 testthat_3.1.10 cachem_1.0.8
## [19] lifecycle_1.0.3 pkgconfig_2.0.3 Matrix_1.6-0
## [22] R6_2.5.1 fastmap_1.1.1 GenomeInfoDbData_1.2.10
## [25] digest_0.6.33 colorspace_2.1-0 AnnotationDbi_1.62.2
## [28] rprojroot_2.0.3 crosstalk_1.2.0 RSQLite_2.3.1
## [31] labeling_0.4.2 fansi_1.0.4 timechange_0.2.0
## [34] httr_1.4.6 abind_1.4-5 compiler_4.3.1
## [37] bit64_4.0.5 withr_2.5.0 BiocParallel_1.34.2
## [40] DBI_1.1.3 hexbin_1.28.3 highr_0.10
## [43] DelayedArray_0.26.7 gtools_3.9.4 caTools_1.18.2
## [46] tools_4.3.1 glue_1.6.2 generics_0.1.3
## [49] gtable_0.3.3 tzdb_0.4.0 preprocessCore_1.62.1
## [52] hms_1.1.3 utf8_1.2.3 XVector_0.40.0
## [55] pillar_1.9.0 vroom_1.6.3 limma_3.56.2
## [58] genefilter_1.82.1 softImpute_1.4-1 splines_4.3.1
## [61] survival_3.5-5 bit_4.0.5 deldir_1.0-9
## [64] annotate_1.78.0 tidyselect_1.2.0 locfit_1.5-9.8
## [67] Biostrings_2.68.1 knitr_1.43 edgeR_3.42.4
## [70] xfun_0.39 brio_1.1.3 stringi_1.7.12
## [73] lazyeval_0.2.2 yaml_2.3.7 evaluate_0.21
## [76] codetools_0.2-19 interp_1.1-4 BiocManager_1.30.21.1
## [79] cli_3.6.1 affyio_1.70.0 xtable_1.8-4
## [82] munsell_0.5.0 jquerylib_0.1.4 Rcpp_1.0.11
## [85] png_0.1-8 XML_3.99-0.14 ellipsis_0.3.2
## [88] blob_1.2.4 latticeExtra_0.6-30 jpeg_0.1-10
## [91] bitops_1.0-7 viridisLite_0.4.2 scales_1.2.1
## [94] affy_1.78.2 crayon_1.5.2 rlang_1.1.1
## [97] KEGGREST_1.40.0